#Importing the basic librarires
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
#from brokenaxes import brokenaxes
from statsmodels.formula import api
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10,6]
import warnings
warnings.filterwarnings('ignore')
import csv
df=pd.read_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/Walmart-Sales.csv')
df.head(5)
| Store | Date | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 5/2/10 | 1643690.90 | 0 | 42.31 | 2.572 | 211.096358 | 8.106 |
| 1 | 1 | 12/2/10 | 1641957.44 | 1 | 38.51 | 2.548 | 211.242170 | 8.106 |
| 2 | 1 | 19-02-2010 | 1611968.17 | 0 | 39.93 | 2.514 | 211.289143 | 8.106 |
| 3 | 1 | 26-02-2010 | 1409727.59 | 0 | 46.63 | 2.561 | 211.319643 | 8.106 |
| 4 | 1 | 5/3/10 | 1554806.68 | 0 | 46.50 | 2.625 | 211.350143 | 8.106 |
# Iterate over the columns in the DataFrame
for column in df.columns:
# Check the data type of the column
if df[column].dtype == 'object':
# If the data type is object, the variable is categorical
print(column, 'is a categorical variable')
else:
# If the data type is not object, the variable is continuous
print(column, 'is a continuous variable')
Store is a continuous variable Date is a categorical variable Weekly_Sales is a continuous variable Holiday_Flag is a continuous variable Temperature is a continuous variable Fuel_Price is a continuous variable CPI is a continuous variable Unemployment is a continuous variable
num_rows = len(df)
print('Number of rows:', num_rows)
print('Number of columns:', df.shape[1])
df.columns
df.info()
Number of rows: 7008 Number of columns: 8 <class 'pandas.core.frame.DataFrame'> RangeIndex: 7008 entries, 0 to 7007 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store 7008 non-null int64 1 Date 7008 non-null object 2 Weekly_Sales 7008 non-null float64 3 Holiday_Flag 7008 non-null int64 4 Temperature 7008 non-null float64 5 Fuel_Price 7008 non-null float64 6 CPI 7008 non-null float64 7 Unemployment 7008 non-null float64 dtypes: float64(5), int64(2), object(1) memory usage: 438.1+ KB
print(df.duplicated())
# remove duplicates and keep the first occurrence
df = df.drop_duplicates()
# print the cleaned dataframe
print(df)
num_rows = len(df)
print('Number of rows:', num_rows)
# get the number of missing rows
num_missing_rows = df.isnull().any(axis=1).sum()
print('Number of missing rows:', num_missing_rows)
# get the number of missing columns
num_missing_cols = df.isnull().any(axis=0).sum()
print('Number of missing columns:', num_missing_cols)
0 False
1 False
2 False
3 False
4 False
...
7003 True
7004 True
7005 True
7006 True
7007 True
Length: 7008, dtype: bool
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price \
0 1 5/2/10 1643690.90 0 42.31 2.572
1 1 12/2/10 1641957.44 1 38.51 2.548
2 1 19-02-2010 1611968.17 0 39.93 2.514
3 1 26-02-2010 1409727.59 0 46.63 2.561
4 1 5/3/10 1554806.68 0 46.50 2.625
... ... ... ... ... ... ...
6430 45 28-09-2012 713173.95 0 64.88 3.997
6431 45 5/10/12 733455.07 0 64.89 3.985
6432 45 12/10/12 734464.36 0 54.47 4.000
6433 45 19-10-2012 718125.53 0 56.47 3.969
6434 45 26-10-2012 760281.43 0 58.85 3.882
CPI Unemployment
0 211.096358 8.106
1 211.242170 8.106
2 211.289143 8.106
3 211.319643 8.106
4 211.350143 8.106
... ... ...
6430 192.013558 8.684
6431 192.170412 8.667
6432 192.327265 8.667
6433 192.330854 8.667
6434 192.308899 8.667
[6435 rows x 8 columns]
Number of rows: 6435
Number of missing rows: 0
Number of missing columns: 0
from datetime import datetime
def convert_date(date_string):
try:
date = datetime.strptime(date_string, "%d/%m/%y")
except ValueError:
date = datetime.strptime(date_string, "%d-%m-%Y")
return date.strftime("%d-%m-%Y")
# apply conversion function to Date column
df["Date"] = df["Date"].apply(convert_date)
print(df)
Store Date Weekly_Sales Holiday_Flag Temperature Fuel_Price \
0 1 05-02-2010 1643690.90 0 42.31 2.572
1 1 12-02-2010 1641957.44 1 38.51 2.548
2 1 19-02-2010 1611968.17 0 39.93 2.514
3 1 26-02-2010 1409727.59 0 46.63 2.561
4 1 05-03-2010 1554806.68 0 46.50 2.625
... ... ... ... ... ... ...
6430 45 28-09-2012 713173.95 0 64.88 3.997
6431 45 05-10-2012 733455.07 0 64.89 3.985
6432 45 12-10-2012 734464.36 0 54.47 4.000
6433 45 19-10-2012 718125.53 0 56.47 3.969
6434 45 26-10-2012 760281.43 0 58.85 3.882
CPI Unemployment
0 211.096358 8.106
1 211.242170 8.106
2 211.289143 8.106
3 211.319643 8.106
4 211.350143 8.106
... ... ...
6430 192.013558 8.684
6431 192.170412 8.667
6432 192.327265 8.667
6433 192.330854 8.667
6434 192.308899 8.667
[6435 rows x 8 columns]
import matplotlib.pyplot as plt
import scipy.stats as stats
# generate qqplot for Weekly_Sales using the converted data
fig, ax = plt.subplots()
stats.probplot(df['Weekly_Sales'], dist='norm', plot=ax)
ax.set_title('Q-Q plot for Weekly_Sales')
ax.set_xlabel('Theoretical quantiles')
ax.set_ylabel('Sample quantiles')
plt.show()
import matplotlib.pyplot as plt
# Aggregating data by 'Store' and Finding sum of 'Weekly_Sales'
Store_Sales = df.groupby(['Store']).sum()
# Changing column name of sales
Store_Sales.rename(columns={'Weekly_Sales': 'Total_Sales_by_Store'}, inplace=True)
# Finding out Store with highest Sales
Store_Sales = Store_Sales.sort_values(by=['Total_Sales_by_Store'], ascending=False)
# Choosing the first store that comes in this order
max_store = Store_Sales.index[0]
max_sales = Store_Sales.iloc[0]['Total_Sales_by_Store']
# Printing the output
print(f"Store no {max_store} has the maximum sales and the value is {max_sales}")
# Plotting the results
plt.bar(Store_Sales.index, Store_Sales['Total_Sales_by_Store'])
plt.title('Total Sales by Store')
plt.xlabel('Store')
plt.ylabel('Total Sales')
plt.show()
Store no 20 has the maximum sales and the value is 301397792.46
#Weekly sales average
weekly_sales = df[['Date', 'Weekly_Sales']].groupby('Date', as_index=False).mean()
weekly_sales['Date'] = pd.to_datetime(weekly_sales['Date'], format="%d-%m-%Y")
weekly_sales = weekly_sales.sort_values('Date').reset_index(drop=True)
weekly_sales['Date'] = pd.Categorical(weekly_sales['Date'])
#plotting weekly mean sales
plt.figure(figsize=(14,8))
b = sns.lineplot(data=weekly_sales, x="Date", y="Weekly_Sales")
b.set(xticklabels=weekly_sales['Date'][::10])
b.set_xticklabels(labels=weekly_sales['Date'][::10], rotation=90, ha='right')
plt.xlabel('Week')
plt.ylabel('Mean Sales of Week')
plt.title('Weekly Mean Sales')
plt.show()
# Finding the highest and lowest mean sales
highest_mean = weekly_sales.loc[weekly_sales['Weekly_Sales'].idxmax()]
lowest_mean = weekly_sales.loc[weekly_sales['Weekly_Sales'].idxmin()]
print(f"Highest Mean Sales: {highest_mean['Weekly_Sales']:.2f} on {highest_mean['Date'].strftime('%d-%m-%Y')}")
print(f"Lowest Mean Sales: {lowest_mean['Weekly_Sales']:.2f} on {lowest_mean['Date'].strftime('%d-%m-%Y')}")
Highest Mean Sales: 1798475.90 on 24-12-2010 Lowest Mean Sales: 879996.73 on 28-01-2011
#Holiday and Noholiday sales
df['Date'] = pd.to_datetime(df['Date'])
SuperBowl = pd.to_datetime(['12-02-2010', '11-02-2011', '10-02-2012', '08-02-2013'])
LabourDay = pd.to_datetime(['10-09-2010', '09-09-2011', '07-09-2012', '06-09-2013'])
Thanksgiving = pd.to_datetime(['26-11-2010','25-11-2011', '23-11-2012', '29-11-2013'])
Christmas = pd.to_datetime(['31-12-2010', '30-12-2011', '28-12-2012', '27-12-2013'])
walmart_h = df[['Date', 'Weekly_Sales']]
walmart_h['hflag'] = np.where(walmart_h['Date'].isin(SuperBowl), 'SB',
np.where(walmart_h['Date'].isin(LabourDay), 'LD',
np.where(walmart_h['Date'].isin(Thanksgiving), 'TG',
np.where(walmart_h['Date'].isin(Christmas), 'CH', 'None'))))
pd.options.display.float_format = '{:.0f}'.format
walmart_h_mean = walmart_h.groupby('hflag').mean()
print(walmart_h_mean)
Weekly_Sales hflag CH 960833 LD 1042427 None 1041256 SB 1079128 TG 1471273
df.columns
Index(['Store', 'Date', 'Weekly_Sales', 'Holiday_Flag', 'Temperature',
'Fuel_Price', 'CPI', 'Unemployment'],
dtype='object')
# Relabeling the columns in the dataset
df.Date=pd.to_datetime(df.Date)
df['weekday'] = df.Date.dt.weekday
df['month'] = df.Date.dt.month
df['year'] = df.Date.dt.year
# df['Monthly_Quarter'] = df.month.map({1:'Q1',2:'Q1',3:'Q1',4:'Q2',5:'Q2',6:'Q2',7:'Q3',
# 8:'Q3',9:'Q3',10:'Q4',11:'Q4',12:'Q4'})
df.drop(['Date'], axis=1, inplace=True)#,'month'
target = 'Weekly_Sales'
features = [i for i in df.columns if i not in [target]]
original_df = df.copy(deep=True)
df.head()
| Store | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | weekday | month | year | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1643691 | 0 | 42 | 3 | 211 | 8 | 6 | 5 | 2010 |
| 1 | 1 | 1641957 | 1 | 39 | 3 | 211 | 8 | 3 | 12 | 2010 |
| 2 | 1 | 1611968 | 0 | 40 | 3 | 211 | 8 | 4 | 2 | 2010 |
| 3 | 1 | 1409728 | 0 | 47 | 3 | 211 | 8 | 4 | 2 | 2010 |
| 4 | 1 | 1554807 | 0 | 46 | 3 | 211 | 8 | 0 | 5 | 2010 |
df.columns
Index(['Store', 'Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price',
'CPI', 'Unemployment', 'weekday', 'month', 'year'],
dtype='object')
#Checking the data types of all the columns
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 6435 entries, 0 to 6434 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Store 6435 non-null int64 1 Weekly_Sales 6435 non-null float64 2 Holiday_Flag 6435 non-null int64 3 Temperature 6435 non-null float64 4 Fuel_Price 6435 non-null float64 5 CPI 6435 non-null float64 6 Unemployment 6435 non-null float64 7 weekday 6435 non-null int64 8 month 6435 non-null int64 9 year 6435 non-null int64 dtypes: float64(5), int64(5) memory usage: 553.0 KB
#Checking number of unique rows in each feature
df.nunique().sort_values()
Holiday_Flag 2 year 3 weekday 7 month 12 Store 45 Unemployment 349 Fuel_Price 892 CPI 2145 Temperature 3528 Weekly_Sales 6435 dtype: int64
#Checking for the number of unique rows in each variable
nu = df[features].nunique().sort_values()
nf = []; cf = []; nnf = 0; ncf = 0; #categorical and numerical variables
for i in range(df[features].shape[1]):
if nu.values[i]<=45:cf.append(nu.index[i])
else: nf.append(nu.index[i])
print('\n\033[1mInference:\033[0m The Datset contains {} numerical & {} categorical features.'.format(len(nf),len(cf)))
Inference: The Datset contains 4 numerical & 5 categorical features.
#Checking the stats of all the columns
display(df.describe())
| Store | Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | weekday | month | year | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 6435 | 6435 | 6435 | 6435 | 6435 | 6435 | 6435 | 6435 | 6435 | 6435 |
| mean | 23 | 1046965 | 0 | 61 | 3 | 172 | 8 | 4 | 6 | 2011 |
| std | 13 | 564367 | 0 | 18 | 0 | 39 | 2 | 1 | 3 | 1 |
| min | 1 | 209986 | 0 | -2 | 2 | 126 | 4 | 0 | 1 | 2010 |
| 25% | 12 | 553350 | 0 | 47 | 3 | 132 | 7 | 4 | 4 | 2010 |
| 50% | 23 | 960746 | 0 | 63 | 3 | 183 | 8 | 4 | 6 | 2011 |
| 75% | 34 | 1420159 | 0 | 75 | 4 | 213 | 9 | 4 | 9 | 2012 |
| max | 45 | 3818686 | 1 | 100 | 4 | 227 | 14 | 6 | 12 | 2012 |
#Analyzing the distribution of the target variable
plt.figure(figsize=[8,4])
sns.distplot(df[target], color='g',hist_kws=dict(edgecolor="black", linewidth=2), bins=30)
plt.title('Target Variable Distribution - Median Value of Sales ($1Ms)')
plt.show()
#Visualisation of the categorical features
print('\033[1mVisualising the Categorical Features:'.center(100))
n=2
plt.figure(figsize=[15,3*math.ceil(len(cf)/n)])
for i in range(len(cf)):
if df[cf[i]].nunique()<=8:
plt.subplot(math.ceil(len(cf)/n),n,i+1)
sns.countplot(df[cf[i]])
else:
plt.subplot(3,1,i-1)
sns.countplot(df[cf[i]])
plt.tight_layout()
plt.show()
Visualising the Categorical Features:
#Visualisation of the numeric variables
print('\033[1mNumeric variables Distribution'.center(130))
n=4
clr=['r','g','b','g','b','r']
plt.figure(figsize=[15,6*math.ceil(len(nf)/n)])
for i in range(len(nf)):
plt.subplot(math.ceil(len(nf)/3),n,i+1)
sns.distplot(df[nf[i]],hist_kws=dict(edgecolor="black",
linewidth=2), bins=10, color=list(np.random.randint([255,255,255])/255))
plt.tight_layout()
plt.show()
plt.figure(figsize=[15,6*math.ceil(len(nf)/n)])
for i in range(len(nf)):
plt.subplot(math.ceil(len(nf)/3),n,i+1)
df.boxplot(nf[i])
plt.tight_layout()
plt.show()
Numeric variables Distribution
#Understanding the relationship between all the variables in the dataset
g = sns.pairplot(df)
plt.title('Pairplots for all the Variables')
g.map_upper(sns.kdeplot, levels=4, color=".2")
plt.show()
#Check for empty elements
nvc = pd.DataFrame(df.isnull().sum().sort_values(), columns=['Total Null Values'])
nvc['Percentage'] = round(nvc['Total Null Values']/df.shape[0],3)*100
print(nvc)
Total Null Values Percentage Store 0 0 Weekly_Sales 0 0 Holiday_Flag 0 0 Temperature 0 0 Fuel_Price 0 0 CPI 0 0 Unemployment 0 0 weekday 0 0 month 0 0 year 0 0
#Converting categorical variables to Numeric variables
df3 = df.copy()
ecc = nvc[nvc['Percentage']!=0].index.values
fcc = [i for i in cf if i not in ecc]
#One-Hot Binary Encoding
oh=True
dm=True
for i in fcc:
#print(i)
if df3[i].nunique()==2:
if oh==True: print("\033[1mOne-Hot Encoding on variables:\033[0m")
print(i);oh=False
df3[i]=pd.get_dummies(df3[i], drop_first=True, prefix=str(i))
if (df3[i].nunique()>2):
if dm==True: print("\n\033[1mDummy Encoding on variables:\033[0m")
print(i);dm=False
df3 = pd.concat([df3.drop([i], axis=1), pd.DataFrame(pd.get_dummies(df3[i],
drop_first=True, prefix=str(i)))],axis=1)
df3.shape
One-Hot Encoding on variables: Holiday_Flag Dummy Encoding on variables: year weekday month Store
(6435, 69)
#Removal of outliers:
df1 = df3.copy()
#features1 = [i for i in features if i not in ['RAD','CHAS']]
features1 = nf
for i in features1:
Q1 = df1[i].quantile(0.25)
Q3 = df1[i].quantile(0.75)
IQR = Q3 - Q1
df1 = df1[df1[i] <= (Q3+(1.5*IQR))]
df1 = df1[df1[i] >= (Q1-(1.5*IQR))]
df1 = df1.reset_index(drop=True)
display(df1.head())
print('\n\033[1mInference:\033[0m\nBefore removal of outliers, the dataset contains {} samples.'
.format(df3.shape[0]))
print('After removal of outliers, the dataset contains {} samples.'.format(df1.shape[0]))
| Weekly_Sales | Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year_2011 | year_2012 | weekday_1 | weekday_2 | ... | Store_36 | Store_37 | Store_38 | Store_39 | Store_40 | Store_41 | Store_42 | Store_43 | Store_44 | Store_45 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1643691 | 0 | 42 | 3 | 211 | 8 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1641957 | 1 | 39 | 3 | 211 | 8 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1611968 | 0 | 40 | 3 | 211 | 8 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1409728 | 0 | 47 | 3 | 211 | 8 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1554807 | 0 | 46 | 3 | 211 | 8 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 69 columns
Inference:
Before removal of outliers, the dataset contains 6435 samples.
After removal of outliers, the dataset contains 5953 samples.
#Final Dataset size after performing Preprocessing
df = df1.copy()
df.columns=[i.replace('-','_') for i in df.columns]
plt.title('Final Dataset')
plt.pie([df.shape[0], original_df.shape[0]-df.shape[0]], radius = 1, labels=['Retained','Dropped'], counterclock=False,
autopct='%1.1f%%', pctdistance=0.9, explode=[0,0], shadow=True)
plt.pie([df.shape[0]], labels=['100%'], labeldistance=-0, radius=0.78)
plt.show()
#Visualizing the correlation matrix
print('\033[1mCorrelation Matrix'.center(100))
plt.figure(figsize=[25,20])
sns.heatmap(df.corr(), annot=True, fmt='.2f', vmin=-1, vmax=1, center=0) #cmap='BuGn'
plt.show()
Correlation Matrix
#Splitting the data into training and testing sets
m=[]
for i in df.columns.values:
m.append(i.replace(' ','_'))
df.columns = m
X = df.drop([target],axis=1)
Y = df[target]
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, Y, train_size=0.7, test_size=0.3, random_state=100)
Train_X.reset_index(drop=True,inplace=True)
print('Original data set ',X.shape,Y.shape,'\nTraining data set ',
Train_X.shape,Train_Y.shape,'\nTesting data set ', Test_X.shape,'', Test_Y.shape)
Original data set (5953, 68) (5953,) Training data set (4167, 68) (4167,) Testing data set (1786, 68) (1786,)
#Performing Standardization
std = StandardScaler()
print('\033[1mStandardardization on Training data set'.center(120))
Train_X_std = std.fit_transform(Train_X)
Train_X_std = pd.DataFrame(Train_X_std, columns=X.columns)
display(Train_X_std.describe())
print('\n','\033[1mStandardardization on Testing data set'.center(120))
Test_X_std = std.transform(Test_X)
Test_X_std = pd.DataFrame(Test_X_std, columns=X.columns)
display(Test_X_std.describe())
Standardardization on Training data set
| Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year_2011 | year_2012 | weekday_1 | weekday_2 | weekday_3 | ... | Store_36 | Store_37 | Store_38 | Store_39 | Store_40 | Store_41 | Store_42 | Store_43 | Store_44 | Store_45 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4167 | 4167 | 4167 | 4167 | 4167 | 4167 | 4167 | 4167 | 4167 | 4167 | ... | 4167 | 4167 | 4167 | 4167 | 4167 | 4167 | 4167 | 4167 | 4167 | 4167 |
| mean | 0 | 0 | -0 | -0 | -0 | 0 | -0 | -0 | 0 | 0 | ... | 0 | -0 | -0 | 0 | -0 | 0 | -0 | -0 | 0 | -0 |
| std | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| min | -0 | -3 | -2 | -1 | -3 | -1 | -1 | -0 | -0 | -0 | ... | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 |
| 25% | -0 | -1 | -1 | -1 | -1 | -1 | -1 | -0 | -0 | -0 | ... | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 |
| 50% | -0 | 0 | 0 | 0 | 0 | -1 | -1 | -0 | -0 | -0 | ... | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 |
| 75% | -0 | 1 | 1 | 1 | 1 | 1 | 2 | -0 | -0 | -0 | ... | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 |
| max | 4 | 2 | 2 | 1 | 3 | 1 | 2 | 4 | 9 | 4 | ... | 6 | 6 | 23 | 6 | 8 | 7 | 6 | 6 | 7 | 7 |
8 rows × 68 columns
Standardardization on Testing data set
| Holiday_Flag | Temperature | Fuel_Price | CPI | Unemployment | year_2011 | year_2012 | weekday_1 | weekday_2 | weekday_3 | ... | Store_36 | Store_37 | Store_38 | Store_39 | Store_40 | Store_41 | Store_42 | Store_43 | Store_44 | Store_45 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1786 | 1786 | 1786 | 1786 | 1786 | 1786 | 1786 | 1786 | 1786 | 1786 | ... | 1786 | 1786 | 1786 | 1786 | 1786 | 1786 | 1786 | 1786 | 1786 | 1786 |
| mean | 0 | 0 | 0 | 0 | -0 | 0 | -0 | 0 | 0 | 0 | ... | 0 | -0 | 0 | 0 | 0 | 0 | -0 | -0 | 0 | 0 |
| std | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| min | -0 | -3 | -2 | -1 | -3 | -1 | -1 | -0 | -0 | -0 | ... | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 |
| 25% | -0 | -1 | -1 | -1 | -1 | -1 | -1 | -0 | -0 | -0 | ... | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 |
| 50% | -0 | 0 | 0 | 0 | 0 | -1 | -1 | -0 | -0 | -0 | ... | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 |
| 75% | -0 | 1 | 1 | 1 | 1 | 1 | 2 | -0 | -0 | -0 | ... | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 | -0 |
| max | 4 | 2 | 2 | 1 | 3 | 1 | 2 | 4 | 9 | 4 | ... | 6 | 6 | 23 | 6 | 8 | 7 | 6 | 6 | 7 | 7 |
8 rows × 68 columns
#Testing a Linear Regression model with statsmodels
Train_xy = pd.concat([Train_X_std,Train_Y.reset_index(drop=True)],axis=1)
a = Train_xy.columns.values
API = api.ols(formula='{} ~ {}'.format(target,' + '.join(i for i in Train_X.columns)), data=Train_xy).fit()
#print(API.conf_int())
#print(API.pvalues)
API.summary()
| Dep. Variable: | Weekly_Sales | R-squared: | 0.931 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.930 |
| Method: | Least Squares | F-statistic: | 819.4 |
| Date: | Mon, 27 Mar 2023 | Prob (F-statistic): | 0.00 |
| Time: | 17:01:33 | Log-Likelihood: | -55576. |
| No. Observations: | 4167 | AIC: | 1.113e+05 |
| Df Residuals: | 4098 | BIC: | 1.117e+05 |
| Df Model: | 68 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 1.051e+06 | 2342.620 | 448.804 | 0.000 | 1.05e+06 | 1.06e+06 |
| Holiday_Flag | 4498.6227 | 2653.683 | 1.695 | 0.090 | -704.037 | 9701.282 |
| Temperature | -1.108e+04 | 3914.719 | -2.829 | 0.005 | -1.88e+04 | -3400.650 |
| Fuel_Price | 7066.0594 | 6618.561 | 1.068 | 0.286 | -5909.915 | 2e+04 |
| CPI | 5.058e+05 | 7.09e+04 | 7.133 | 0.000 | 3.67e+05 | 6.45e+05 |
| Unemployment | -6.351e+04 | 7946.015 | -7.993 | 0.000 | -7.91e+04 | -4.79e+04 |
| year_2011 | -3.383e+04 | 6928.894 | -4.882 | 0.000 | -4.74e+04 | -2.02e+04 |
| year_2012 | -7.531e+04 | 9641.447 | -7.811 | 0.000 | -9.42e+04 | -5.64e+04 |
| weekday_1 | 6114.1130 | 3290.061 | 1.858 | 0.063 | -336.193 | 1.26e+04 |
| weekday_2 | -1.125e+04 | 2766.375 | -4.067 | 0.000 | -1.67e+04 | -5826.151 |
| weekday_3 | -1.331e+04 | 3300.944 | -4.032 | 0.000 | -1.98e+04 | -6839.216 |
| weekday_4 | -1.538e+04 | 4407.517 | -3.491 | 0.000 | -2.4e+04 | -6743.868 |
| weekday_5 | -1.239e+04 | 3050.293 | -4.062 | 0.000 | -1.84e+04 | -6409.791 |
| weekday_6 | -1601.3677 | 3340.154 | -0.479 | 0.632 | -8149.883 | 4947.148 |
| month_2 | 2.829e+04 | 3450.045 | 8.201 | 0.000 | 2.15e+04 | 3.51e+04 |
| month_3 | 1.983e+04 | 3515.224 | 5.642 | 0.000 | 1.29e+04 | 2.67e+04 |
| month_4 | 2.002e+04 | 3735.968 | 5.358 | 0.000 | 1.27e+04 | 2.73e+04 |
| month_5 | 2.055e+04 | 3749.128 | 5.482 | 0.000 | 1.32e+04 | 2.79e+04 |
| month_6 | 3.053e+04 | 3528.249 | 8.654 | 0.000 | 2.36e+04 | 3.75e+04 |
| month_7 | 1.788e+04 | 3765.347 | 4.750 | 0.000 | 1.05e+04 | 2.53e+04 |
| month_8 | 2.283e+04 | 3647.472 | 6.259 | 0.000 | 1.57e+04 | 3e+04 |
| month_9 | 1.021e+04 | 3735.982 | 2.734 | 0.006 | 2887.962 | 1.75e+04 |
| month_10 | 1.466e+04 | 3810.604 | 3.846 | 0.000 | 7186.422 | 2.21e+04 |
| month_11 | 4.165e+04 | 3511.127 | 11.863 | 0.000 | 3.48e+04 | 4.85e+04 |
| month_12 | 6.25e+04 | 3845.898 | 16.252 | 0.000 | 5.5e+04 | 7e+04 |
| Store_2 | 5.856e+04 | 3299.068 | 17.750 | 0.000 | 5.21e+04 | 6.5e+04 |
| Store_3 | -1.87e+05 | 3424.808 | -54.595 | 0.000 | -1.94e+05 | -1.8e+05 |
| Store_4 | 2.234e+05 | 2.29e+04 | 9.768 | 0.000 | 1.79e+05 | 2.68e+05 |
| Store_5 | -1.932e+05 | 3437.689 | -56.200 | 0.000 | -2e+05 | -1.86e+05 |
| Store_6 | -1.019e+04 | 3446.121 | -2.958 | 0.003 | -1.7e+04 | -3438.411 |
| Store_7 | -9.879e+04 | 6705.936 | -14.731 | 0.000 | -1.12e+05 | -8.56e+04 |
| Store_8 | -1.212e+05 | 3772.879 | -32.129 | 0.000 | -1.29e+05 | -1.14e+05 |
| Store_9 | -1.641e+05 | 3569.151 | -45.972 | 0.000 | -1.71e+05 | -1.57e+05 |
| Store_10 | 2.315e+05 | 2.44e+04 | 9.475 | 0.000 | 1.84e+05 | 2.79e+05 |
| Store_11 | -3.924e+04 | 3365.721 | -11.658 | 0.000 | -4.58e+04 | -3.26e+04 |
| Store_12 | 4.479e+04 | 9737.960 | 4.600 | 0.000 | 2.57e+04 | 6.39e+04 |
| Store_13 | 2.298e+05 | 2.4e+04 | 9.578 | 0.000 | 1.83e+05 | 2.77e+05 |
| Store_14 | 1.408e+05 | 8886.353 | 15.842 | 0.000 | 1.23e+05 | 1.58e+05 |
| Store_15 | 1.675e+04 | 2.26e+04 | 0.742 | 0.458 | -2.75e+04 | 6.1e+04 |
| Store_16 | -1.261e+05 | 7116.715 | -17.724 | 0.000 | -1.4e+05 | -1.12e+05 |
| Store_17 | 6.131e+04 | 2.5e+04 | 2.453 | 0.014 | 1.23e+04 | 1.1e+05 |
| Store_18 | 9.59e+04 | 2.3e+04 | 4.171 | 0.000 | 5.08e+04 | 1.41e+05 |
| Store_19 | 1.41e+05 | 2.23e+04 | 6.331 | 0.000 | 9.73e+04 | 1.85e+05 |
| Store_20 | 9.556e+04 | 3833.227 | 24.929 | 0.000 | 8.8e+04 | 1.03e+05 |
| Store_21 | -1.244e+05 | 3306.298 | -37.632 | 0.000 | -1.31e+05 | -1.18e+05 |
| Store_22 | 7.636e+04 | 2.21e+04 | 3.449 | 0.001 | 3.3e+04 | 1.2e+05 |
| Store_23 | 9.665e+04 | 2.03e+04 | 4.772 | 0.000 | 5.69e+04 | 1.36e+05 |
| Store_24 | 1.334e+05 | 2.26e+04 | 5.912 | 0.000 | 8.91e+04 | 1.78e+05 |
| Store_25 | -1.25e+05 | 3890.176 | -32.138 | 0.000 | -1.33e+05 | -1.17e+05 |
| Store_26 | 7.587e+04 | 2.29e+04 | 3.319 | 0.001 | 3.1e+04 | 1.21e+05 |
| Store_27 | 1.91e+05 | 2.18e+04 | 8.745 | 0.000 | 1.48e+05 | 2.34e+05 |
| Store_28 | 5.208e+04 | 8700.647 | 5.986 | 0.000 | 3.5e+04 | 6.91e+04 |
| Store_29 | 1.971e+04 | 2.34e+04 | 0.843 | 0.399 | -2.61e+04 | 6.56e+04 |
| Store_30 | -1.756e+05 | 3307.046 | -53.085 | 0.000 | -1.82e+05 | -1.69e+05 |
| Store_31 | -2.156e+04 | 3191.610 | -6.756 | 0.000 | -2.78e+04 | -1.53e+04 |
| Store_32 | -8966.6532 | 6916.297 | -1.296 | 0.195 | -2.25e+04 | 4593.044 |
| Store_33 | -1.93e+04 | 2.46e+04 | -0.784 | 0.433 | -6.75e+04 | 2.9e+04 |
| Store_34 | 9.881e+04 | 2.42e+04 | 4.082 | 0.000 | 5.14e+04 | 1.46e+05 |
| Store_35 | 6.249e+04 | 2.1e+04 | 2.969 | 0.003 | 2.12e+04 | 1.04e+05 |
| Store_36 | -1.758e+05 | 3286.652 | -53.497 | 0.000 | -1.82e+05 | -1.69e+05 |
| Store_37 | -1.579e+05 | 3341.738 | -47.236 | 0.000 | -1.64e+05 | -1.51e+05 |
| Store_38 | 1.107e+04 | 7505.460 | 1.474 | 0.140 | -3649.149 | 2.58e+04 |
| Store_39 | -1.049e+04 | 3284.031 | -3.194 | 0.001 | -1.69e+04 | -4050.460 |
| Store_40 | 3.619e+04 | 1.85e+04 | 1.951 | 0.051 | -172.130 | 7.25e+04 |
| Store_41 | -7639.6166 | 6874.380 | -1.111 | 0.266 | -2.11e+04 | 5837.901 |
| Store_42 | 2.521e+04 | 2.5e+04 | 1.008 | 0.314 | -2.38e+04 | 7.42e+04 |
| Store_43 | -1.086e+05 | 4576.890 | -23.730 | 0.000 | -1.18e+05 | -9.96e+04 |
| Store_44 | -2.663e+04 | 2.4e+04 | -1.111 | 0.267 | -7.36e+04 | 2.04e+04 |
| Store_45 | -5.128e+04 | 8473.219 | -6.052 | 0.000 | -6.79e+04 | -3.47e+04 |
| Omnibus: | 2974.895 | Durbin-Watson: | 2.000 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 97387.972 |
| Skew: | 3.002 | Prob(JB): | 0.00 |
| Kurtosis: | 25.910 | Cond. No. | 80.6 |
from sklearn.preprocessing import PolynomialFeatures
Trr=[]; Tss=[]; n=3
order=['ord-'+str(i) for i in range(2,n)]
#Trd = pd.DataFrame(np.zeros((10,n-2)), columns=order)
#Tsd = pd.DataFrame(np.zeros((10,n-2)), columns=order)
DROP=[];b=[]
for i in range(len(Train_X_std.columns)):
vif = pd.DataFrame()
X = Train_X_std.drop(DROP,axis=1)
vif['Features'] = X.columns
vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif.reset_index(drop=True, inplace=True)
if vif.loc[0][1]>1:
DROP.append(vif.loc[0][0])
LR = LinearRegression()
LR.fit(Train_X_std.drop(DROP,axis=1), Train_Y)
pred1 = LR.predict(Train_X_std.drop(DROP,axis=1))
pred2 = LR.predict(Test_X_std.drop(DROP,axis=1))
Trr.append(np.sqrt(mean_squared_error(Train_Y, pred1)))
Tss.append(np.sqrt(mean_squared_error(Test_Y, pred2)))
print('Dropped Features --> ',DROP)
undropped_columns = list(set(Train_X_std.columns) - set(DROP))
print("Undropped columns:", undropped_columns)
plt.plot(Trr, label='Train RMSE')
plt.plot(Tss, label='Test RMSE')
plt.legend()
plt.grid()
plt.show()
Dropped Features --> ['CPI', 'Unemployment', 'Fuel_Price', 'weekday_4', 'month_7', 'Store_17', 'Temperature', 'month_12', 'year_2012', 'Store_29', 'month_2', 'Store_2', 'month_11', 'Store_27', 'month_5', 'Store_16', 'Store_18', 'month_10', 'Store_22', 'Holiday_Flag', 'year_2011', 'Store_33', 'Store_21', 'month_9', 'Store_19', 'Store_25', 'Store_15', 'Store_14', 'Store_37', 'month_4', 'Store_20', 'Store_24', 'Store_13', 'Store_45', 'Store_44', 'Store_43', 'weekday_5', 'month_8', 'Store_42', 'Store_11', 'Store_41', 'Store_10', 'weekday_3', 'Store_6', 'weekday_1', 'weekday_2', 'Store_26', 'Store_23', 'Store_40', 'Store_39', 'Store_35', 'Store_9', 'month_3', 'Store_3', 'Store_4', 'Store_8'] Undropped columns: ['Store_7', 'Store_38', 'month_6', 'Store_30', 'Store_32', 'Store_28', 'Store_34', 'weekday_6', 'Store_12', 'Store_5', 'Store_36', 'Store_31']
from sklearn.preprocessing import PolynomialFeatures
Trr=[]; Tss=[]; n=3
order=['ord-'+str(i) for i in range(2,n)]
Trd = pd.DataFrame(np.zeros((10,n-2)), columns=order)
Tsd = pd.DataFrame(np.zeros((10,n-2)), columns=order)
m=df.shape[1]-2
for i in range(m):
lm = LinearRegression()
rfe = RFE(lm,n_features_to_select=Train_X_std.shape[1]-i)
rfe = rfe.fit(Train_X_std, Train_Y)
LR = LinearRegression()
LR.fit(Train_X_std.loc[:,rfe.support_], Train_Y)
pred1 = LR.predict(Train_X_std.loc[:,rfe.support_])
pred2 = LR.predict(Test_X_std.loc[:,rfe.support_])
Trr.append(np.sqrt(mean_squared_error(Train_Y, pred1)))
Tss.append(np.sqrt(mean_squared_error(Test_Y, pred2)))
plt.plot(Trr, label='Train RMSE')
plt.plot(Tss, label='Test RMSE')
plt.legend()
plt.grid()
plt.show()
#Shortlisting the selected Features (with RFE)
lm = LinearRegression()
rfe = RFE(lm,n_features_to_select=Train_X_std.shape[1]-28)
rfe = rfe.fit(Train_X_std, Train_Y)
LR = LinearRegression()
LR.fit(Train_X_std.loc[:,rfe.support_], Train_Y)
print(Train_X_std.loc[:,rfe.support_].columns)
pred1 = LR.predict(Train_X_std.loc[:,rfe.support_])
pred2 = LR.predict(Test_X_std.loc[:,rfe.support_])
print(np.sqrt(mean_squared_error(Train_Y, pred1)))
print(np.sqrt(mean_squared_error(Test_Y, pred2)))
Train_X_std = Train_X_std.loc[:,rfe.support_]
Test_X_std = Test_X_std.loc[:,rfe.support_]
Index(['CPI', 'Unemployment', 'year_2011', 'year_2012', 'month_12', 'Store_2',
'Store_3', 'Store_4', 'Store_5', 'Store_7', 'Store_8', 'Store_9',
'Store_10', 'Store_12', 'Store_13', 'Store_14', 'Store_15', 'Store_16',
'Store_17', 'Store_18', 'Store_19', 'Store_20', 'Store_21', 'Store_22',
'Store_23', 'Store_24', 'Store_25', 'Store_26', 'Store_27', 'Store_28',
'Store_29', 'Store_30', 'Store_34', 'Store_35', 'Store_36', 'Store_37',
'Store_40', 'Store_42', 'Store_43', 'Store_45'],
dtype='object')
159874.46675185006
156505.19663797048
#Multiple Linear Regression
MLR = LinearRegression().fit(Train_X_std,Train_Y)
pred1 = MLR.predict(Train_X_std)
pred2 = MLR.predict(Test_X_std)
print('The Intercept of the Regresion Model was found to be',MLR.intercept_)
coefficients = pd.Series(MLR.coef_, index=Train_X_std.columns)
print('The Coefficients of the Regression Model are found to be:\n', coefficients)
Evaluate(0, pred1, pred2)
The Intercept of the Regresion Model was found to be 1051376.4668682504 The Coefficients of the Regression Model are found to be: CPI 519675 Unemployment -41407 year_2011 -30764 year_2012 -66636 month_12 43592 Store_2 72217 Store_3 -172131 Store_4 244683 Store_5 -176496 Store_7 -84069 Store_8 -102505 Store_9 -147021 Store_10 248305 Store_12 47520 Store_13 250766 Store_14 154549 Store_15 36362 Store_16 -106276 Store_17 85044 Store_18 113528 Store_19 160259 Store_20 112408 Store_21 -110596 Store_22 96050 Store_23 121077 Store_24 150807 Store_25 -107991 Store_26 96052 Store_27 209716 Store_28 53792 Store_29 34176 Store_30 -161345 Store_34 111658 Store_35 77974 Store_36 -163099 Store_37 -145148 Store_40 58120 Store_42 42934 Store_43 -100386 Store_45 -38329 dtype: float64
#Evaluating the models
Model_Evaluation_Comparison_Matrix = pd.DataFrame(np.zeros([5,8]),
columns=['Train-R2','Test-R2','Train-RSS','Test-RSS',
'Train-MSE','Test-MSE','Train-RMSE','Test-RMSE'])
rc=np.random.choice(Train_X_std.loc[:,Train_X_std.nunique()>=50].columns.values,2,replace=False)
def Evaluate(n, pred1,pred2):
#Plotting predicted predicteds alongside the actual datapoints
plt.figure(figsize=[15,6])
for e,i in enumerate(rc):
plt.subplot(2,3,e+1)
plt.scatter(y=Train_Y, x=Train_X_std[i], label='Actual')
plt.scatter(y=pred1, x=Train_X_std[i], label='Prediction')
plt.legend()
plt.show()
#Evaluating the results of Multiple Linear Regression Model
print('\n\n{}Training Set Metrics{}'.format('-'*20, '-'*20))
print('\nR2-Score on Training set --->',round(r2_score(Train_Y, pred1),20))
print('Residual Sum of Squares (RSS) on Training set --->',round(np.sum(np.square(Train_Y-pred1)),20))
print('Mean Squared Error (MSE) on Training set --->',round(mean_squared_error(Train_Y, pred1),20))
print('Root Mean Squared Error (RMSE) on Training set --->',round(np.sqrt(mean_squared_error(Train_Y, pred1)),20))
print('\n{}Testing Set Metrics{}'.format('-'*20, '-'*20))
print('\nR2-Score on Testing set --->',round(r2_score(Test_Y, pred2),20))
print('Residual Sum of Squares (RSS) on Testing set --->',round(np.sum(np.square(Test_Y-pred2)),20))
print('Mean Squared Error (MSE) on Testing set --->',round(mean_squared_error(Test_Y, pred2),20))
print('Root Mean Squared Error (RMSE) on Testing set --->',round(np.sqrt(mean_squared_error(Test_Y, pred2)),20))
print('\n{}Residual Plots{}'.format('-'*20, '-'*20))
Model_Evaluation_Comparison_Matrix.loc[n,'Train-R2'] = round(r2_score(Train_Y, pred1),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Test-R2'] = round(r2_score(Test_Y, pred2),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Train-RSS'] = round(np.sum(np.square(Train_Y-pred1)),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Test-RSS'] = round(np.sum(np.square(Test_Y-pred2)),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Train-MSE'] = round(mean_squared_error(Train_Y, pred1),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Test-MSE'] = round(mean_squared_error(Test_Y, pred2),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Train-RMSE']= round(np.sqrt(mean_squared_error(Train_Y, pred1)),20)
Model_Evaluation_Comparison_Matrix.loc[n,'Test-RMSE'] = round(np.sqrt(mean_squared_error(Test_Y, pred2)),20)
# Plotting y_test and y_pred and understanding the spread.
plt.figure(figsize=[15,4])
plt.subplot(1,2,1)
sns.distplot((Train_Y - pred1))
plt.title('Error Terms')
plt.xlabel('Errors')
plt.subplot(1,2,2)
plt.scatter(Train_Y,pred1)
plt.plot([Train_Y.min(),Train_Y.max()],[Train_Y.min(),Train_Y.max()], 'r--')
plt.title('Test data vs Prediction')
plt.xlabel('y_test')
plt.ylabel('y_pred')
plt.show()
--------------------Training Set Metrics-------------------- R2-Score on Training set ---> 0.9221377512636295 Residual Sum of Squares (RSS) on Training set ---> 106507874611658.14 Mean Squared Error (MSE) on Training set ---> 25559845119.188416 Root Mean Squared Error (RMSE) on Training set ---> 159874.46675185006 --------------------Testing Set Metrics-------------------- R2-Score on Testing set ---> 0.9246903759637972 Residual Sum of Squares (RSS) on Testing set ---> 43746063562396.0 Mean Squared Error (MSE) on Testing set ---> 24493876574.689808 Root Mean Squared Error (RMSE) on Testing set ---> 156505.19663797048 --------------------Residual Plots--------------------
# Save the train and test datasets as CSV files in the specified directory
Train_X.to_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/train_X.csv', index=False)
Test_X.to_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/test_X.csv', index=False)
Train_Y.to_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/train_Y.csv', index=False)
Test_Y.to_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/test_Y.csv', index=False)
df.to_csv('/Users/Kavya/Desktop/MSDA_Kavya/MSDA_D214/df.csv', index=False)